#For data manipulation
import pandas as pd
#For numerical manipulations
import numpy as np
#For data visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#Machine learning algorithms
import statsmodels.api as sm
from sklearn.model_selection import train_test_split #For Building Train and Test Set
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
#To generate report of data
import pandas_profiling as pd_prof
import warnings
warnings.filterwarnings('ignore')
# Load the labelled training data from the local Hackathon download folder.
train_csv_path = "C:\\Users\\Administrator\\Downloads\\Hackathon\\train.csv"
train = pd.read_csv(train_csv_path)

# First look at the data: leading rows, (rows, columns) and summary statistics.
# These are bare expressions, so they only show output in an interactive
# (notebook) session.
train.head(n=5)
train.shape
train.describe()
import pandas_profiling as pd_prof
# Profiling report of the training data before the target is encoded.
pd_prof.ProfileReport(train)

# Class distribution of the target while it still holds the text labels.
train['Severity'].value_counts()

# Encode the severity labels as integers, from 1 (Minor_Damage_And_Injuries)
# up to 4 (Highly_Fatal_And_Damaging).
# The result is assigned back to the column rather than calling
# ``replace(..., inplace=True)`` on ``train['Severity']``: that form mutates an
# intermediate Series (chained assignment), which pandas warns about and which
# leaves ``train`` unchanged once copy-on-write is enabled.
severity_codes = {
    'Highly_Fatal_And_Damaging': 4,
    'Significant_Damage_And_Fatalities': 3,
    'Significant_Damage_And_Serious_Injuries': 2,
    'Minor_Damage_And_Injuries': 1,
}
# ``pd.to_numeric`` guarantees a numeric dtype (``replace`` alone may leave the
# column as ``object``) and raises if a label outside the mapping is present.
train['Severity'] = pd.to_numeric(train['Severity'].replace(severity_codes))

# Re-inspect the data now that the target is numeric.
train['Severity'].value_counts()
train.describe()
pd_prof.ProfileReport(train)

# Box plot of the columns, then the correlation of each column with the
# encoded target.
train.boxplot()
train.corr()['Severity']
import statsmodels.api as sm
from sklearn.model_selection import train_test_split #For Building Train and Test Set
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score ,roc_curve,auc
from sklearn.model_selection import GridSearchCV,StratifiedKFold
# Features are every column except the target. The target is taken as a 1-D
# Series (not a one-column DataFrame) so scikit-learn gets the label shape it
# expects instead of a column vector.
x = train.drop(['Severity'], axis=1)
y = train['Severity']

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Sweep the number of trees from 20 to 99 and print every setting whose
# weighted F1 score (as a percentage) on the hold-out rows exceeds 93.
for i in range(20, 100):
    rfc = RandomForestClassifier(n_estimators=i, random_state=47, bootstrap=False)
    rfc.fit(xtrain, ytrain)
    pred = rfc.predict(xtest)
    score = 100 * f1_score(ytest, pred, average='weighted')
    if score > 93:
        print(score, i)
# Load the unlabelled data set that predictions are produced for.
test = pd.read_csv("C:\\Users\\Administrator\\Downloads\\Hackathon\\test.csv")
test.head()
test.shape

# Refit on the complete training data (no hold-out) before predicting.
# The target is a 1-D Series so the classifier receives the expected shape.
x = train.drop(['Severity'], axis=1)
y = train['Severity']

# Fit and predict with ``rfc1`` itself. ``rfc`` is only the last model left
# over from the n_estimators sweep and has a different configuration, so it
# must not be used for the final predictions.
# NOTE(review): the sweep evaluated forests with bootstrap=False, whereas this
# model uses the default bootstrap setting -- confirm which one is intended.
rfc1 = RandomForestClassifier(n_estimators=98, random_state=47)
rfc1.fit(x, y)
pred = rfc1.predict(test)

# One row per accident: the ID becomes the index (written under the header
# 'Accident') and the prediction goes in the 'Severity' column.
# NOTE(review): Severity is written with whatever values the model was trained
# on (the integer codes, not the original text labels), and the ID header is
# 'Accident' rather than 'Accident_ID' -- confirm the expected output format.
output = pd.DataFrame(
    {'Severity': pred},
    index=pd.Index(test['Accident_ID'], name='Accident'),
)
output.to_csv("C:\\Users\\Administrator\\Downloads\\Hackathon\\output1.csv")